Setting Parameters:

Preprocessing performed with Seurat and Scater. Output to that Rmd file, will visualize multiple settings for PCs to use.

## Parameters for processing dataset and metadata ##
## Cell-cycle scoring on genes:
CC_file = "../scrna/refs/CellCycle/TiroshRegev2016_CellCycleGenes.csv"
MT_genes_file = "../scrna/CMdiff_mon-EB-EHT_d4-26_all_s2s_ercc/genome_addons/MT_genes.txt"

# Chronological order of levels (for instance timepoints)
reorder_label <- "Timepoint"
order_levels <- c("d4" ,"d5","d7", "d8","d9", "d10","d11", "d12", "d13", "d14", "d18", "d21", "m5", "m11", "m12")

## Dimensionality reduction ##
# Settings for UMAP and clustering
elbow_dim_use = 55
PC_set = 50
res_set = 1.5
grouping_settings = c("seurat_clusters","Timepoint", "Lineage", "Method", "Experiment")
w_umappatch = 12
h_umappatch = 12

## Visualization ##
explore_violin = c("NKX2-5", "MYL7")
cc_goi = c("PCNA", "TOP2A", "MCM6", "MKI67")
selected_markers2 = c("GATA4","NKX2-5","ISL1","PDGFRA","RYR2","TNNT2","MYL7","PECAM1","MYL2","IRX4","COL3A1","MKI67")
other_featureprint = c("nFeature_sf","nCount_sf", "GFP", "mCherry","NKX2-5","NR2F2")
GOI_file = ""
no_of_DEgenes = 10
# Show marker heatmap with reordering of the clusters?
reorder_clusterlevels = FALSE
order_clusterlevels = c("0",  "1",  "2",  "3",  "4",  "5",  "6",  "7",  "8",  "9",  "10", "11", "12", "13", "14", "15", "16")

## Subsetting the dataset ##
# On CellCycle-state?
# Specify a specific (sub)string to select the cells on (selection happens on the colnames)
subset_var = "Phase"
subset_id = "G1"
# Select the type of filtering: keep cells with the substring (set "in") or remove (set "out")
# if NO filtering is wanted, leave empty (set "" or "no")
filtering = "no"
# Settings after filtering:
vars_to_regress = NULL #c("nCount_sf", "nFeature_sf") # If no regression desired: NULL
nHVG = 2000
sub_PC_set = 12
sub_res_set = 1.0


## Storing results ##
dataset_location <- "../scrna/CMdiff_mon-EB-EHT_combined/rerun_v4/preprocess_R/20211020_ExpIntegration/seusetv4_integrated.rds"
workdir <- "../scrna/CMdiff_mon-EB-EHT_combined/rerun_v4/clust_umap_R/"
# if regression is performed: this will already be included in the folder name
result_descript = "_ClusteringUMAP_toPy_noSubset"

## Location of scripts ##
source("../R-scripts/scRNA-seq/feature_plot_pdf_fuction.R")
source("../R-scripts/scRNA-seq/Seurat_subset_rerun.R")
source("../R-scripts/scRNA-seq/seurat_to_matrices.R")
system(paste("mkdir -p ", workdir))
knitr::opts_knit$set(root.dir = normalizePath(workdir))
set.seed(42)
# ########## Installing important repositories ##########
# # ONLY NEEDED THE FIRST TIME LOADING THIS ENVIRONMENT #
# 
# # Needed dependencies for transfer to python with H5AD type-convert:
# Sys.setenv(R_REMOTES_NO_ERRORS_FROM_WARNINGS="true")
# devtools::install_github(repo = "mojaveazure/loomR", ref = "develop")
# install.packages("remotes")
# remotes::install_github("mojaveazure/seurat-disk")

Loading the dataset

Normalized, scaled dataset with information on the highest variable genes (HVGs).

seuset <- readRDS(paste0(dataset_location))

Dataset specific edit:

experiment_table <- read.table("../scrna/CMdiff_mon-EB-EHT_combined/sample-experiment_table.txt", header = TRUE, row.names = 1)
seuset@meta.data$Experiment <- experiment_table[seuset@meta.data$combined_id,]

Setting up results directory

dateoftoday <- gsub("-", "", as.character(Sys.Date()))
resultsdir <- paste(workdir, dateoftoday, result_descript, sep = "")
system(paste("mkdir -p ", resultsdir))
knitr::opts_knit$set(root.dir = normalizePath(resultsdir))

Results will be stored in: resultsdir

Adjusting set-up of the object

seuset@meta.data[[reorder_label]] <- factor(seuset@meta.data[[reorder_label]], levels = order_levels)
ElbowPlot(seuset, ndims = elbow_dim_use)

Cluster the cells & Non-linear dimensional reduction UMAP

Seurat v3 applies a graph-based clustering approach, in which the cells are embedded in a graph structure (for example a K-nearest neighbor graph), with edges between the cells with similar feature expression patterns (determined with euclidean distances in PCA space and overlap within local neighborhoods, as determined with Jaccard similarity). The cells are then tried to partition into quasi-cliques or communities.

Louvain clustering is used after, to iteratively group cells together

# Calculating the distances and finding the neighbors:
seuset <- FindNeighbors(seuset, dims = 1:PC_set)
## Computing nearest neighbor graph
## Computing SNN
seuset <- FindClusters(seuset, resolution = res_set)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 12706
## Number of edges: 557967
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8505
## Number of communities: 31
## Elapsed time: 1 seconds
head(Idents(seuset), 10)
##  GRCh38_ERCC_reporter_EB_AM_d10_1_942_P8 
##                                       18 
## GRCh38_ERCC_reporter_EB_AM_d10_1_942_L15 
##                                       15 
##  GRCh38_ERCC_reporter_EB_AM_d10_1_942_D1 
##                                        9 
## GRCh38_ERCC_reporter_EB_AM_d10_1_942_P14 
##                                       15 
##  GRCh38_ERCC_reporter_EB_AM_d10_1_942_E9 
##                                        9 
## GRCh38_ERCC_reporter_EB_AM_d10_1_942_L11 
##                                        9 
## GRCh38_ERCC_reporter_EB_AM_d10_1_942_B12 
##                                        9 
## GRCh38_ERCC_reporter_EB_AM_d10_1_942_C12 
##                                       10 
## GRCh38_ERCC_reporter_EB_AM_d10_1_942_L14 
##                                       10 
##  GRCh38_ERCC_reporter_EB_AM_d10_1_942_M2 
##                                       15 
## 31 Levels: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 ... 30
seuset <- RunUMAP(seuset, dims = 1:PC_set)
## 18:50:35 UMAP embedding parameters a = 0.9922 b = 1.112
## 18:50:35 Read 12706 rows and found 50 numeric columns
## 18:50:35 Using Annoy for neighbor search, n_neighbors = 30
## 18:50:35 Building Annoy index with metric = cosine, n_trees = 50
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 18:50:36 Writing NN index file to temp file /scratch/snabel/tmp/BoiX/Rtmp01Usjr/file22b4060eb10dc
## 18:50:36 Searching Annoy index using 1 thread, search_k = 3000
## 18:50:39 Annoy recall = 100%
## 18:50:40 Commencing smooth kNN distance calibration using 1 thread
## 18:50:40 Initializing from normalized Laplacian + noise
## 18:50:41 Commencing optimization for 200 epochs, with 555434 positive edges
## 18:50:46 Optimization finished
pdf(paste0("UMAPs_PC1-", PC_set, "_", res_set,"_clusters.pdf"), width = 10, height = 7)
DimPlot(seuset, reduction = "umap", group.by = "seurat_clusters", label = TRUE, combine = TRUE)
dev.off()
## png 
##   2

Assign Cell-Cycle Scores

“First, we assign each cell a score, based on its expression of G2/M and S phase markers. These marker sets should be anticorrelated in their expression levels, and cells expressing neither are likely not cycling and in G1 phase.” - Seurat

Using the list in Seurat´s tutorials from Tirosh for their CC correction.

# Generate cell cycle gene lists
ccgenes <- read.csv(CC_file, header = TRUE, sep = ";")

g2m.genes <- as.character(ccgenes$G2.M)
s.genes <- as.character(ccgenes$G1.S)
s.genes <- s.genes[!s.genes == ""]

s.genes <- s.genes[s.genes %in% rownames(seuset)]
g2m.genes <- g2m.genes[g2m.genes %in% rownames(seuset)]
# Running scoring and dimensionality reduction with the cell cycle gene list
seuset <- CellCycleScoring(seuset, s.features = s.genes, g2m.features = g2m.genes, set.ident = FALSE)
seuset.cc <- RunPCA(seuset, features = unique(c(s.genes, g2m.genes)), npcs = 10)
## PC_ 1 
## Positive:  E2F8, UNG, WDR76, CDCA7, ATAD2, UHRF1, GMNN, CDC6, HN1, GAS2L3 
##     EXO1, GINS2, MCM2, MCM4, MCM6, DTL, FEN1, BLM, RRM1, RAD51 
##     NASP, HELLS, ECT2, AURKA, PCNA, NCAPD2, BRIP1, BIRC5, HMMR, TMPO 
## Negative:  TOP2A, CENPF, UBE2C, CDK1, NUSAP1, TPX2, MKI67, DLGAP5, CENPE, KIF2C 
##     AURKB, ANLN, TTK, NDC80, KIF23, KIF11, HMGB2, CDCA8, NUF2, GTSE1 
##     SMC4, KIF20B, HJURP, RRM2, TACC3, RAD51AP1, BUB1, CKS1B, CDCA2, CENPA 
## PC_ 2 
## Positive:  CENPA, AURKA, CDCA3, MKI67, DLGAP5, GAS2L3, HMMR, FAM64A, TPX2, CENPE 
##     BUB1, CCNB2, CKAP2L, GTSE1, CDCA8, KIF23, UBE2C, BIRC5, ECT2, AURKB 
##     CKAP2, TACC3, CENPF, NDC80, NUF2, TOP2A, KIF2C, NUSAP1, ANLN, CDCA2 
## Negative:  DTL, PCNA, MCM4, CDC6, CLSPN, UHRF1, MCM6, MCM2, HELLS, UNG 
##     FEN1, EXO1, GINS2, RAD51, BLM, CDCA7, WDR76, NASP, BRIP1, RAD51AP1 
##     GMNN, ATAD2, RRM2, TMPO, RRM1, SMC4, CDK1, E2F8, HMGB2, KIF20B 
## PC_ 3 
## Positive:  E2F8, ATAD2, AURKA, HJURP, NDC80, KIF23, CKAP2L, CDK1, RRM2, EXO1 
##     CDCA2, GMNN, CENPA, CDC6, WDR76, CDCA3, CDCA8, RAD51, KIF2C, GINS2 
##     DTL, BRIP1, ANLN, TTK, AURKB, GAS2L3, KIF11, CLSPN, NUF2, FEN1 
## Negative:  NASP, CCNB2, NCAPD2, CDCA7, MCM6, HMGB2, TMPO, BIRC5, CENPF, DLGAP5 
##     MCM2, CENPE, CKS2, HELLS, CKS1B, HMMR, HN1, UNG, TPX2, SMC4 
##     FAM64A, MKI67, KIF20B, CKAP2, ECT2, BUB1, TOP2A, BLM, UHRF1, MCM4 
## PC_ 4 
## Positive:  ATAD2, GMNN, RRM1, HN1, BIRC5, ANLN, CKAP2, ECT2, E2F8, GINS2 
##     AURKA, FEN1, CDC6, SMC4, HMMR, BUB1, CDCA3, CCNB2, FAM64A, CENPF 
##     RRM2, PCNA, MCM4, TPX2, CKS1B, CLSPN, MCM6, CENPE, NASP, MKI67 
## Negative:  HJURP, BLM, CDCA2, BRIP1, EXO1, CDK1, WDR76, CENPA, AURKB, HELLS 
##     TTK, CKAP2L, CDCA8, UBE2C, TMPO, KIF11, HMGB2, NDC80, RAD51AP1, UNG 
##     KIF2C, CDCA7, NCAPD2, NUF2, KIF23, TACC3, TOP2A, UHRF1, MCM2, KIF20B 
## PC_ 5 
## Positive:  GMNN, ECT2, CKAP2L, AURKA, CDCA2, ATAD2, CENPA, CDCA3, CDCA7, KIF23 
##     HJURP, CDK1, CKS2, MCM6, UNG, WDR76, NDC80, EXO1, FAM64A, NUF2 
##     CDCA8, UBE2C, UHRF1, CDC6, PCNA, GINS2, MCM4, HN1, KIF2C, RRM2 
## Negative:  E2F8, BIRC5, CCNB2, TMPO, NCAPD2, NASP, GTSE1, DLGAP5, HMGB2, HELLS 
##     CENPF, GAS2L3, CENPE, BRIP1, RAD51, MKI67, AURKB, CLSPN, SMC4, MCM2 
##     RAD51AP1, KIF20B, TOP2A, FEN1, ANLN, BLM, TTK, TACC3, TPX2, CKS1B
# Visualizing the effect of cell cycle in the dataset
DimPlot(seuset.cc, reduction = "pca", group.by = "Phase")

DimPlot(seuset, group.by = "Phase")

RidgePlot(seuset, features = cc_goi, ncol = 2)
## Picking joint bandwidth of 0.138
## Picking joint bandwidth of 0.0531
## Picking joint bandwidth of 0.0626
## Picking joint bandwidth of 0.0228

pl.list <- list()

for (i in c(grouping_settings, "Phase")){
  print(i)
  pl.list[[i]] <- DimPlot(seuset, reduction = "umap", group.by = i, combine = TRUE)
  print(pl.list[[i]])
}
## [1] "seurat_clusters"

## [1] "Timepoint"

## [1] "Lineage"

## [1] "Method"

## [1] "Experiment"

## [1] "Phase"

pdf(paste0("UMAPs_PC1-", PC_set, "_", res_set,".pdf"), width = w_umappatch, height = h_umappatch)
Reduce( `+`, pl.list ) +
      patchwork::plot_layout( ncol = 2 )
dev.off()
## png 
##   2

Feature plots for markers of interest

pdf(paste0("Featureplot_PC1-", PC_set, "_SelectedMarkers2.pdf"), width = 20, height = 15)
FeaturePlot(seuset, selected_markers2)
## Warning: Found the following features in more than one assay, excluding the
## default. We will not include these in the final data frame: GATA4, NKX2-5
## Warning in FetchData(object = object, vars = c(dims, "ident", features), : The
## following requested variables were not found: GATA4, NKX2-5
dev.off()
## png 
##   2
pdf(paste0("Featureplot_PC1-", PC_set, paste(other_featureprint, collapse = "_"), ".pdf"), width = 20, height = 15)
FeaturePlot(seuset, other_featureprint)
## Warning: Found the following features in more than one assay, excluding the
## default. We will not include these in the final data frame: mCherry, NKX2-5
## Warning in FetchData(object = object, vars = c(dims, "ident", features), : The
## following requested variables were not found: GFP, mCherry, NKX2-5
dev.off()
## png 
##   2
feature_plot_pdfs(seuset)
## Warning: Found the following features in more than one assay, excluding the
## default. We will not include these in the final data frame: ETV2, TCF21, TBX18
## Warning in FetchData(object = object, vars = c(dims, "ident", features), : The
## following requested variables were not found: ETV2, TCF21, TBX18
## Warning: Found the following features in more than one assay, excluding the
## default. We will not include these in the final data frame: NKX2-5
## Warning in FetchData(object = object, vars = c(dims, "ident", features), : The
## following requested variables were not found: NKX2-5
## Warning: Found the following features in more than one assay, excluding the
## default. We will not include these in the final data frame: GATA4, NKX2-5
## Warning in FetchData(object = object, vars = features, slot = slot): The
## following requested variables were not found: GATA4, NKX2-5
## png 
##   2

Finding differentially expressed features (cluster Biomarkers)

seuset.markers <- FindAllMarkers(seuset, only.pos =TRUE, min.pct = 0.25, logfc.threshold = 0.25)
## Calculating cluster 0
## Calculating cluster 1
## Calculating cluster 2
## Calculating cluster 3
## Calculating cluster 4
## Calculating cluster 5
## Calculating cluster 6
## Calculating cluster 7
## Calculating cluster 8
## Calculating cluster 9
## Calculating cluster 10
## Calculating cluster 11
## Calculating cluster 12
## Calculating cluster 13
## Calculating cluster 14
## Calculating cluster 15
## Calculating cluster 16
## Calculating cluster 17
## Calculating cluster 18
## Calculating cluster 19
## Calculating cluster 20
## Calculating cluster 21
## Calculating cluster 22
## Calculating cluster 23
## Calculating cluster 24
## Calculating cluster 25
## Calculating cluster 26
## Calculating cluster 27
## Calculating cluster 28
## Calculating cluster 29
## Calculating cluster 30
write.csv(seuset.markers, paste0("all_seuset_markers_PC1-", PC_set,"_res", res_set,".csv"), quote = FALSE)
top_markers <- seuset.markers %>% group_by(cluster) %>% top_n(n = no_of_DEgenes, wt = avg_log2FC)

DoHeatmap(seuset, features = top_markers$gene, group.by = "seurat_clusters") + NoLegend()

pdf(paste0("ClusterHeatmapTop", no_of_DEgenes, "_PC1-", PC_set,"_res", res_set, "_UMAP.pdf"), width = 20, height = 20)
DoHeatmap(seuset, features = top_markers$gene, group.by = "seurat_clusters") + NoLegend() +
  theme(text = element_text(size = 8))
dev.off()
## png 
##   2
if (reorder_clusterlevels == TRUE){ 
  # Set the order of clusters for visualization:
  old_levels <- levels(seuset$seurat_clusters)
  levels(seuset$seurat_clusters) <- order_clusterlevels
  
  pdf(paste0("ClusterHeatmapTop", no_of_DEgenes, "_PC1-", PC_set,"_res", res_set, "_UMAP_reordered.pdf"), width = 20, height = 20)
  DoHeatmap(seuset, features = top_markers$gene, group.by = "seurat_clusters") + NoLegend() + theme(text = element_text(size = 8))
  dev.off()
  
  # # reset level order:
  #levels(seuset$seurat_clusters) <- old_levels
  }
saveRDS(seuset, paste0("seusetv4_PC1-", PC_set, "_res", res_set, ".rds" ))

Writing Seurat object as separate matrices

seurat_to_matrices(seurat_object = seuset, output_folder = resultsdir, integration_object = TRUE)
## [1] "Generating files at: ../scrna/CMdiff_mon-EB-EHT_combined/rerun_v4/clust_umap_R/20211021_ClusteringUMAP_toPy_noSubset/ for assay: sf, a counts.mtx and scaled_counts.csv"
## [1] "Generating files at: ../scrna/CMdiff_mon-EB-EHT_combined/rerun_v4/clust_umap_R/20211021_ClusteringUMAP_toPy_noSubset/ for assay: uf, a counts.mtx and scaled_counts.csv"
## [1] "Generating files at: ../scrna/CMdiff_mon-EB-EHT_combined/rerun_v4/clust_umap_R/20211021_ClusteringUMAP_toPy_noSubset/ for assay: integrated, a counts.mtx and scaled_counts.csv"
sessionInfo()
## R version 4.1.0 (2021-05-18)
## Platform: x86_64-conda-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.3 LTS
## 
## Matrix products: default
## BLAS/LAPACK: /vol/mbconda/snabel/anaconda3/envs/kb_scrna_R_mon3/lib/libopenblasp-r0.3.17.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] patchwork_1.1.1    Matrix_1.3-4       scales_1.1.1       RColorBrewer_1.1-2
##  [5] SeuratObject_4.0.2 Seurat_4.0.4       knitr_1.33         tidyr_1.1.3       
##  [9] dplyr_1.0.7        ggplot2_3.3.5     
## 
## loaded via a namespace (and not attached):
##   [1] Rtsne_0.15            colorspace_2.0-2      deldir_0.2-10        
##   [4] ellipsis_0.3.2        ggridges_0.5.3        spatstat.data_2.1-0  
##   [7] farver_2.1.0          leiden_0.3.9          listenv_0.8.0        
##  [10] ggrepel_0.9.1         RSpectra_0.16-0       fansi_0.5.0          
##  [13] codetools_0.2-18      splines_4.1.0         polyclip_1.10-0      
##  [16] jsonlite_1.7.2        ica_1.0-2             cluster_2.1.2        
##  [19] png_0.1-7             uwot_0.1.10           shiny_1.6.0          
##  [22] sctransform_0.3.2     spatstat.sparse_2.0-0 compiler_4.1.0       
##  [25] httr_1.4.2            assertthat_0.2.1      fastmap_1.1.0        
##  [28] lazyeval_0.2.2        cli_3.0.1             limma_3.48.3         
##  [31] later_1.3.0           htmltools_0.5.2       tools_4.1.0          
##  [34] igraph_1.2.6          gtable_0.3.0          glue_1.4.2           
##  [37] RANN_2.6.1            reshape2_1.4.4        Rcpp_1.0.7           
##  [40] scattermore_0.7       jquerylib_0.1.4       vctrs_0.3.8          
##  [43] nlme_3.1-152          lmtest_0.9-38         xfun_0.25            
##  [46] stringr_1.4.0         globals_0.14.0        mime_0.11            
##  [49] miniUI_0.1.1.1        lifecycle_1.0.0       irlba_2.3.3          
##  [52] goftest_1.2-2         future_1.22.1         MASS_7.3-54          
##  [55] zoo_1.8-9             spatstat.core_2.3-0   promises_1.2.0.1     
##  [58] spatstat.utils_2.2-0  parallel_4.1.0        yaml_2.2.1           
##  [61] reticulate_1.20       pbapply_1.4-3         gridExtra_2.3        
##  [64] sass_0.4.0            rpart_4.1-15          stringi_1.7.4        
##  [67] highr_0.9             rlang_0.4.11          pkgconfig_2.0.3      
##  [70] matrixStats_0.60.1    evaluate_0.14         lattice_0.20-44      
##  [73] ROCR_1.0-11           purrr_0.3.4           tensor_1.5           
##  [76] labeling_0.4.2        htmlwidgets_1.5.3     cowplot_1.1.1        
##  [79] tidyselect_1.1.1      parallelly_1.27.0     RcppAnnoy_0.0.19     
##  [82] plyr_1.8.6            magrittr_2.0.1        R6_2.5.1             
##  [85] generics_0.1.0        DBI_1.1.1             pillar_1.6.2         
##  [88] withr_2.4.2           mgcv_1.8-36           fitdistrplus_1.1-5   
##  [91] survival_3.2-13       abind_1.4-5           tibble_3.1.4         
##  [94] future.apply_1.8.1    crayon_1.4.1          KernSmooth_2.23-20   
##  [97] utf8_1.2.2            spatstat.geom_2.2-2   plotly_4.9.4.1       
## [100] rmarkdown_2.10        grid_4.1.0            data.table_1.14.0    
## [103] digest_0.6.27         xtable_1.8-4          httpuv_1.6.2         
## [106] munsell_0.5.0         viridisLite_0.4.0     bslib_0.3.0